/* Elements carrying the my_button class. */
.my_button {
  width: 70px;
  padding: 10px;
  border: none;
  cursor: pointer;
  font-size: 14px;
  color: #3c2a34;
  background-color: #caa8b1;
}

/* All heading levels: centred, light text. */
h1, h2, h3, h4, h5, h6 {
  color: #e0cdd6;
  text-align: center;
}

/* Dark page background. */
body {
  background-color: #3c2a34;
}

/* Tables: horizontally centred, light text. */
table {
  color: #e0cdd6;
  margin: 1em auto;
}
# Absolute project root; the other locations below are derived from it.
project_path <- "D:/moje/projekty/chess-in-the-digital-age"
# Switch the working directory to the presentation folder.
setwd(paste0(project_path, "/presentation"))
# Folder that receives the downloaded and reduced data sets.
Dataset_path <- paste0(project_path, "/Dataset")
# Month (YYYY-MM) handed to the link/game-count selection helpers below.
selected_date <- "2024-04"
# NOTE(review): `root` is set as a chunk option here; knitr's documented way to
# change the evaluation directory is opts_knit$set(root.dir = ...) - confirm intent.
knitr::opts_chunk$set(root = paste0(project_path, "/presentation"))
library(pander)
panderOptions('digits',7)
library(knitr)
library(rvest)
library(stringi)
library(dplyr)
library(scales)
library(ggplot2)
library(png)
library("patchwork")
library(forcats)
# Pictures placed on top of the plots, read as native rasters.
img <- readPNG(source = "../pictures/queens_gambit.png", native = TRUE)
img2 <- readPNG(source = "../pictures/candidates_tournament.png", native = TRUE)
img3 <- readPNG(source = "../pictures/covid.png", native = TRUE)
img4 <- readPNG(source = "../pictures/opening_icon.png", native = TRUE)

# Page to scrape and the XPath of the table node inside it.
url <- "https://database.lichess.org"
path <- "/html/body/div/div[2]/div/section[1]/table"

# Fetch and parse the page, keeping only the node selected by `path`.
wezel <- read_html(url) %>%
  html_node(xpath = path)
// Collapsible "my_init" section: hidden when the script runs, toggled by its button.
const div_my_init = document.getElementById("my_init");
// Fixed: this line used the undefined name `div_init`, which threw a
// ReferenceError and left `button_my_init` unassigned.
div_my_init.style.display = 'none';
var button_my_init = document.getElementById("button_my_init");

/**
 * Toggle the visibility of the "my_init" element and update the label of
 * the "button_my_init" element to match the new state.
 */
function hideMy_init() {
  const isHidden = div_my_init.style.display === 'none';
  div_my_init.style.display = isHidden ? 'block' : 'none';
  button_my_init.innerHTML = isHidden ? "Ukryj" : "Pokaż";
}
# Take the column with the number of games played per month.
games_count_html <- html_table(wezel)[[3]]
# Remove the thousands separators, convert to numbers and drop the last entry.
my_games_count <- data.frame(
  count = as.numeric(
    stri_replace_all(games_count_html, "", regex = "\\,")
  )[-length(games_count_html)]
)

# Collect every hyperlink of the table and turn the hrefs into full URLs.
hyperlinks <- wezel %>% html_nodes("a")
my_href <- hyperlinks %>% html_attr("href")
links <- data.frame(links = paste(url, my_href, sep = "/"))
# Keep every second link (the .zst.torrent ones).
init_links_for_download <- links[seq(2, nrow(links), by = 2), 1]

# Extract the year (text between "rated_" and the first dash) from the links.
my_year_from_imported_links <- data.frame(
  year = data.frame(
    stri_match_all(
      data.frame(init_links_for_download),
      regex = "rated_\\s*(.*?)\\s*[-]"
    )
  )[, 2]
)

# One row per month: year label next to the number of games.
Data_games_count <- cbind(my_year_from_imported_links, my_games_count)
# For every distinct year, find the first position at which it occurs in the
# reversed year column (i.e. where that year starts on the plot's x axis).
first_index_of_years <- vapply(
  unique(my_year_from_imported_links)[, ],
  function(one_year) which(rev(Data_games_count$year) == one_year)[1],
  integer(1),
  USE.NAMES = FALSE
)
# Bar chart of the monthly number of games; x runs from nrow down to 1 along the rows.
my_plot <- ggplot(
  data = Data_games_count,
  mapping = aes(x = nrow(my_games_count):1, y = count)
) +
  # Chart type: one bar per month, filled by year.
  geom_bar(mapping = aes(fill = year), stat = "identity") +
  # Titles.
  labs(
    title = "Number of chess games played on the lichess website",
    x = "Years",
    y = "Number of chess games"
  ) +
  # Legend. NOTE(review): the year labels themselves are passed as the colour
  # values - confirm this palette is intended.
  scale_fill_manual(
    name = "Years",
    values = unique(Data_games_count$year)
  ) +
  expand_limits(x = c(0, NA), y = c(0, NA)) +
  # One tick per year, placed where that year starts.
  scale_x_continuous(
    breaks = rev(first_index_of_years),
    labels = rev(unique(my_year_from_imported_links)[, ])
  ) +
  # Show the y axis in millions.
  scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
  theme(
    text = element_text(size = 20),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )
my_plot
# Same chart with an arrow and three pictures laid over it.
my_plot +
  # Arrow
  geom_segment(
    mapping = aes(x = 70, y = 6.8e7, xend = 86, yend = 6.2e7),
    linewidth = 1.2,
    arrow = arrow(length = unit(5, "mm"))
  ) +
  # Pictures (positions are fractions of the plot area)
  inset_element(p = img, left = 0.02, bottom = 0.54, right = 0.42, top = 0.9) +
  inset_element(p = img2, left = 0.05, bottom = 0.29, right = 0.5, top = 0.52) +
  inset_element(p = img3, left = 0.44, bottom = 0.69, right = 0.65, top = 0.82)
Gambit Królowej
# All anchor nodes of the scraped table and their href attributes.
hyperlinks <- wezel %>% html_nodes("a")
my_href <- hyperlinks %>% html_attr("href")
# Show the first attribute rows: one data frame per anchor, stacked together.
html_attrs(hyperlinks) %>%
  lapply(function(attrs) data.frame(as.list(attrs), stringsAsFactors = FALSE)) %>%
  bind_rows() %>%
  head() %>%
  pander()
| href |
|---|
| standard/lichess_db_standard_rated_2024-05.pgn.zst |
| standard/lichess_db_standard_rated_2024-05.pgn.zst.torrent |
| standard/lichess_db_standard_rated_2024-04.pgn.zst |
| standard/lichess_db_standard_rated_2024-04.pgn.zst.torrent |
| standard/lichess_db_standard_rated_2024-03.pgn.zst |
| standard/lichess_db_standard_rated_2024-03.pgn.zst.torrent |
# Prefix every href with the site address to obtain full download URLs.
links <- data.frame(
  links = paste(url, my_href, sep = "/")
)
links %>%
  head() %>%
  pander()
# Select the .zst.torrent links starting at a given month.
#
# Reads the global `links` data frame.
#
# @param date Pattern (e.g. "2024-04") matched against the date part of each link.
# @return Character vector with the links from the first matching position to
#   the end of the link list.
# Stops with an error when no link matches `date`.
choose_max_date_to_scrap <- function(date) {
  # Every second link is the .zst.torrent one, so keep only those.
  init_links_for_download <- links[seq(2, nrow(links), by = 2), 1]
  # Date part of each link: the text between "rated_" and the next dot.
  full_date_from_imported_links <- stri_match_first_regex(
    init_links_for_download,
    "rated_\\s*(.*?)\\s*[.]"
  )[, 2]
  date_location <- which(stri_detect(full_date_from_imported_links, regex = date))
  if (length(date_location) == 0L) {
    stop("No download link matches date '", date, "'.", call. = FALSE)
  }
  # Use the first match so that `:` always receives a single start index.
  init_links_for_download[date_location[1L]:length(init_links_for_download)]
}
# Links selected for the chosen starting month.
links_for_download <- choose_max_date_to_scrap(selected_date)
pander(
  data.frame(prepared_links = head(links_for_download))
)

# Date part (text between "rated_" and the next dot) of the selected links.
full_date_from_imported_links <- data.frame(
  stri_match_all(
    data.frame(links_for_download),
    regex = "rated_\\s*(.*?)\\s*[.]"
  )
)[, 2]
pander(
  head(data.frame(full_date_from_imported_links))
)
| full_date_from_imported_links |
|---|
| 2024-04 |
| 2024-03 |
| 2024-02 |
| 2024-01 |
| 2023-12 |
| 2023-11 |
# Year part (text between "rated_" and the first dash) of the selected links.
year_from_imported_links <- data.frame(
  stri_match_all(
    data.frame(links_for_download),
    regex = "rated_\\s*(.*?)\\s*[-]"
  )
)[, 2]
pander(
  head(data.frame(year_from_imported_links))
)
| year_from_imported_links |
|---|
| 2024 |
| 2024 |
| 2024 |
| 2024 |
| 2023 |
| 2023 |
# Month part (text between the dash and the next dot) of the selected links.
month_from_imported_links <- data.frame(
  stri_match_all(
    data.frame(links_for_download),
    regex = "-\\s*(.*?)\\s*[.]"
  )
)[, 2]
pander(
  head(data.frame(month_from_imported_links))
)
| month_from_imported_links |
|---|
| 04 |
| 03 |
| 02 |
| 01 |
| 12 |
| 11 |
# Translate the two-digit month numbers into abbreviated month names.
my_month_names <- month.abb[as.integer(month_from_imported_links)]
pander(
  head(data.frame(my_month_names))
)
| my_month_names |
|---|
| Apr |
| Mar |
| Feb |
| Jan |
| Dec |
| Nov |
# One folder path per distinct year, directly below the data set folder.
my_paths_year <- unique(
  paste0(Dataset_path, "/", year_from_imported_links)
)
pander(
  data.frame(my_paths_year)
)
| my_paths_year |
|---|
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2024 |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2023 |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2022 |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2021 |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2020 |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2019 |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2018 |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2017 |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2016 |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2015 |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2014 |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2013 |
# Create the per-year folders listed in the global `my_paths_year`.
#
# Folders that already exist are skipped, so the function can be run again
# safely; missing parent folders are created as well. Does nothing when
# `my_paths_year` is empty.
#
# @return NULL, invisibly.
create_folders_year <- function() {
  missing_dirs <- my_paths_year[!dir.exists(my_paths_year)]
  for (year_dir in missing_dirs) {
    dir.create(year_dir, recursive = TRUE)
  }
  invisible(NULL)
}
# One folder path per month: <data set>/<year>/<month number>. <month name>
my_paths_month <- paste0(
  Dataset_path, "/", year_from_imported_links, "/",
  month_from_imported_links, ". ", my_month_names
)
pander(
  head(data.frame(my_paths_month))
)
| my_paths_month |
|---|
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2024/04. Apr |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2024/03. Mar |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2024/02. Feb |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2024/01. Jan |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2023/12. Dec |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2023/11. Nov |
# Create the per-month folders listed in the global `my_paths_month`.
#
# Folders that already exist are skipped, so the function can be run again
# safely; missing parent folders are created as well. Does nothing when
# `my_paths_month` is empty.
#
# @return NULL, invisibly.
create_folders_month <- function() {
  missing_dirs <- my_paths_month[!dir.exists(my_paths_month)]
  for (month_dir in missing_dirs) {
    dir.create(month_dir, recursive = TRUE)
  }
  invisible(NULL)
}
# Destination file for every torrent, inside its month folder.
my_paths <- paste0(
  Dataset_path, "/", year_from_imported_links, "/",
  month_from_imported_links, ". ", my_month_names,
  "/lichess_db_standard_rated_", full_date_from_imported_links,
  ".pgn.zst.torrent"
)
pander(
  head(data.frame(my_paths))
)
| my_paths |
|---|
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2024/04. Apr/lichess_db_standard_rated_2024-04.pgn.zst.torrent |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2024/03. Mar/lichess_db_standard_rated_2024-03.pgn.zst.torrent |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2024/02. Feb/lichess_db_standard_rated_2024-02.pgn.zst.torrent |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2024/01. Jan/lichess_db_standard_rated_2024-01.pgn.zst.torrent |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2023/12. Dec/lichess_db_standard_rated_2023-12.pgn.zst.torrent |
| D:/moje/projekty/chess-in-the-digital-age/Dataset/2023/11. Nov/lichess_db_standard_rated_2023-11.pgn.zst.torrent |
# Create the year/month folders and download every selected torrent file.
#
# Reads the globals `links_for_download` (sources) and `my_paths`
# (destinations), pairing them by position.
#
# @return NULL, invisibly.
downloading <- function() {
  create_folders_year()
  create_folders_month()
  for (j in seq_along(my_paths)) {
    download.file(links_for_download[j], my_paths[j], mode = "wb")
    # Short pause between requests. This call used to sit inside the
    # download.file() argument list, where it was taken as the `method` argument.
    Sys.sleep(0.2)
  }
  invisible(NULL)
}
downloading()
// Section "tworzenie-struktury": hidden when the script runs.
const my_div1 = document.getElementById("tworzenie-struktury");
my_div1.style.display = 'none';
var button = document.getElementById("my_button");

/**
 * Show the section if it is hidden, hide it otherwise, and set the label of
 * the "my_button" element accordingly.
 */
function myFunction() {
  const wasHidden = my_div1.style.display === 'none';
  if (wasHidden) {
    my_div1.style.display = 'block';
  } else {
    my_div1.style.display = 'none';
  }
  button.innerHTML = wasHidden ? "Ukryj" : "Pokaż";
}
# Select the game counts starting at a given month.
#
# Reads the globals `wezel` (scraped table node) and `links`.
#
# @param date Pattern (e.g. "2024-04") matched against the date part of each link.
# @return The entries of the table's third column from the first matching
#   position up to the number of .zst.torrent links.
# Stops with an error when no link matches `date`.
choose_max_date_to_scrap_game_count <- function(date) {
  # Column of the table that is analysed.
  my_games_count <- html_table(wezel)[[3]]
  # Every second link is the .zst.torrent one.
  init_links_for_download <- links[seq(2, nrow(links), by = 2), 1]
  # Date part of each link: the text between "rated_" and the next dot.
  full_date_from_imported_links <- stri_match_first_regex(
    init_links_for_download,
    "rated_\\s*(.*?)\\s*[.]"
  )[, 2]
  # Positions whose date matches the requested one.
  date_location <- which(stri_detect(full_date_from_imported_links, regex = date))
  if (length(date_location) == 0L) {
    stop("No download link matches date '", date, "'.", call. = FALSE)
  }
  # Use the first match so that `:` always receives a single start index.
  my_games_count[date_location[1L]:length(init_links_for_download)]
}
# Game counts for the selected range of months.
games_count <- choose_max_date_to_scrap_game_count(selected_date)
pander(
  head(data.frame(games_count))
)
| games_count |
|---|
| 91,383,489 |
| 95,810,349 |
| 91,628,934 |
| 99,001,912 |
| 96,909,211 |
| 92,389,636 |
# Build a reduced copy of one month's PGN file.
#
# Reads the first part of the month's "lichess_db_standard_rated_<date>.pgn",
# cuts it just before the first "Event" tag found among the last 23 rows
# read, and writes the result to "Data_<date>.pgn" in the same month folder,
# together with a marker file "ograniczono_dane.txt".
#
# Reads the globals `full_date_from_imported_links`, `games_count`,
# `Dataset_path`, `year_from_imported_links`, `month_from_imported_links`
# and `my_month_names`.
#
# @param date Pattern identifying exactly one month, e.g. "2013-01".
# @param data_size Multiplier applied to (number of games * 18) to obtain the
#   number of rows to read.
# @return The last rows of the written file, as returned by tail().
preparing_month_dataset <- function(date, data_size = 0.001) {
  # Position of the requested month.
  searching_location <- which(
    stri_detect(full_date_from_imported_links, regex = date)
  )
  if (length(searching_location) != 1L) {
    stop(
      "Date '", date, "' must match exactly one month, but matched ",
      length(searching_location), ".",
      call. = FALSE
    )
  }

  # Month folder, built once and shared by every file path below.
  month_dir <- paste0(
    Dataset_path, "/", year_from_imported_links[searching_location], "/",
    month_from_imported_links[searching_location], ". ",
    my_month_names[searching_location]
  )
  pgn_file_to_read <- paste0(
    month_dir, "/lichess_db_standard_rated_",
    full_date_from_imported_links[searching_location], ".pgn"
  )
  pgn_file_to_write <- paste0(month_dir, "/Data_", date, ".pgn")
  file_txt_for_info <- paste0(month_dir, "/ograniczono_dane.txt")

  # Number of rows to read: games in that month * 18 * data_size.
  games_in_month <- as.numeric(
    stri_replace_all(games_count[searching_location], "", regex = "\\,")
  )
  decreasing_game_number <- round(games_in_month * 18 * data_size)

  # comment.char = "" keeps "#" (used in the move text) from being treated
  # as the start of a comment, which would truncate the line.
  my_pgn <- read.table(
    pgn_file_to_read,
    quote = "", sep = "\n", comment.char = "",
    stringsAsFactors = FALSE, nrows = decreasing_game_number
  )

  # Tag names of the last 23 rows read.
  tail_start <- decreasing_game_number - 22
  colnms <- sub("\\[(\\w+).+", "\\1", my_pgn[tail_start:decreasing_game_number, 1])
  # Rows in that window that start a new game.
  Event_location <- which(stri_detect(colnms, regex = "Event"))
  if (length(Event_location) == 0L) {
    stop(
      "No 'Event' tag found in the last rows read from ", pgn_file_to_read,
      call. = FALSE
    )
  }
  # Keep everything before the first such row.
  last_row_to_keep <- tail_start + Event_location[1L] - 2
  my_pgn2 <- my_pgn[1:last_row_to_keep, ]

  # Write the reduced data set and the status marker file.
  write.table(
    my_pgn2, pgn_file_to_write,
    col.names = FALSE, row.names = FALSE, quote = FALSE
  )
  write.table(" ", file_txt_for_info)

  # Read the written file back and return its last rows for inspection.
  testing_last_char <- read.table(
    pgn_file_to_write,
    quote = "", sep = "\n", comment.char = "", stringsAsFactors = FALSE
  )
  tail(testing_last_char)
}
# Delete the original files of one month and leave a marker file behind.
#
# Removes "lichess_db_standard_rated_<date>.pgn.zst" and
# "lichess_db_standard_rated_<date>.pgn" from the month folder, then writes
# "usunieto_oryginal.txt" there.
#
# Reads the globals `full_date_from_imported_links`, `Dataset_path`,
# `year_from_imported_links`, `month_from_imported_links` and `my_month_names`.
#
# @param date Pattern identifying exactly one month, e.g. "2013-01".
# @return The result of the final write.table() call.
removing_oryginal_dataset <- function(date) {
  # Position of the requested month; refuse to delete on an ambiguous match.
  searching_location <- which(
    stri_detect(full_date_from_imported_links, regex = date)
  )
  if (length(searching_location) != 1L) {
    stop(
      "Date '", date, "' must match exactly one month, but matched ",
      length(searching_location), ".",
      call. = FALSE
    )
  }

  month_dir <- paste0(
    Dataset_path, "/", year_from_imported_links[searching_location], "/",
    month_from_imported_links[searching_location], ". ",
    my_month_names[searching_location]
  )
  original_stem <- paste0(
    month_dir, "/lichess_db_standard_rated_",
    full_date_from_imported_links[searching_location]
  )

  # Files to delete.
  file.remove(paste0(original_stem, ".pgn.zst"))
  file.remove(paste0(original_stem, ".pgn"))
  # Status marker file.
  write.table(" ", paste0(month_dir, "/usunieto_oryginal.txt"))
}
# Build the reduced data set for "2013-01", then delete that month's original files.
preparing_month_dataset("2013-01")
removing_oryginal_dataset("2013-01")
// Section "przygotowywanie-zbiorow": hidden when the script runs.
const my_div2 = document.getElementById("przygotowywanie-zbiorow");
my_div2.style.display = 'none';
var button2 = document.getElementById("my_button2");

/**
 * Show the section if it is hidden, hide it otherwise, and set the label of
 * the "my_button2" element accordingly.
 */
function myFunction2() {
  const wasHidden = my_div2.style.display === 'none';
  my_div2.style.display = wasHidden ? 'block' : 'none';
  button2.innerHTML = wasHidden ? "Ukryj kod" : "Pokaż kod";
}
# Command that counts how often each tag name occurs:
# zmienne = fct_count(fct_infreq(sub("\\[(\\w+).+", "\\1", pgn[1:nrow(pgn.df),1])))

# Reduced monthly data sets to analyse.
files_to_read <- paste0(
  Dataset_path, "/", year_from_imported_links, "/",
  month_from_imported_links, ". ", my_month_names,
  "/Data_", full_date_from_imported_links, ".pgn"
)

# Share (in %) of games per month whose "Opening" tag mentions the Queen's Gambit.
# The result vector is allocated up front and seq_along() keeps the loop from
# running when there are no files.
Percent_games_count <- numeric(length(files_to_read))
for (i in seq_along(files_to_read)) {
  pgn <- read.table(files_to_read[i], quote = "", sep = "\n", stringsAsFactors = FALSE)
  # Tag values: the text between the quotes of each [Tag "value"] row.
  pgn.df <- data.frame(matrix(sub("\\[\\w+ \\\"(.+)\\\"\\]", "\\1", pgn[, 1]),
                              byrow = TRUE, ncol = 1))
  # Rows whose tag name is "Opening".
  opening_index <- which(sub("\\[(\\w+).+", "\\1", pgn[, 1]) == "Opening")
  All_games_count <- length(opening_index)
  Queens_Gambit_games_count <- sum(
    stri_detect(pgn.df[opening_index, ], regex = "Queen's Gambit"),
    na.rm = TRUE
  )
  Percent_games_count[i] <- (Queens_Gambit_games_count / All_games_count) * 100
}
my_Percent_games_count <- data.frame(count = Percent_games_count)
Data_Queens_gambit_games_count <- cbind(
  data.frame(year = year_from_imported_links),
  my_Percent_games_count
)

ggplot(Data_Queens_gambit_games_count, aes(x = rev(1:length(games_count)), y = count)) +
  # Chart type
  geom_bar(stat = "identity", aes(fill = year)) +
  # Titles
  labs(x = "Years", y = "Number of chess games ( % ) ", title = "Number of 'Queen's Gambit' openings played") +
  # Legend
  scale_fill_manual(name = "Years", values = unique(Data_Queens_gambit_games_count$year)) +
  expand_limits(x = c(0, NA), y = c(0, NA)) +
  scale_x_continuous(breaks = c(rev(first_index_of_years)),
                     labels = c(rev(unique(year_from_imported_links)))) +
  scale_y_continuous(labels = unit_format(unit = "%", scale = 1)) +
  theme(text = element_text(size = 20),
        axis.text.x = element_text(angle = 90, hjust = 1))
# Reduced monthly data sets to analyse.
files_to_read <- paste0(
  Dataset_path, "/", year_from_imported_links, "/",
  month_from_imported_links, ". ", my_month_names,
  "/Data_", full_date_from_imported_links, ".pgn"
)

# Share (in %) of games per month whose "WhiteElo" value is below 1200.
novice_games_count <- numeric(length(files_to_read))
for (i in seq_along(files_to_read)) {
  pgn <- read.table(files_to_read[i], quote = "", sep = "\n", stringsAsFactors = FALSE)
  # Tag values: the text between the quotes of each [Tag "value"] row.
  pgn.df <- data.frame(matrix(sub("\\[\\w+ \\\"(.+)\\\"\\]", "\\1", pgn[, 1]),
                              byrow = TRUE, ncol = 1))
  # Rows whose tag name is "WhiteElo".
  elo_index <- which(sub("\\[(\\w+).+", "\\1", pgn[, 1]) == "WhiteElo")
  All_games_count <- length(elo_index)
  # Values that cannot be converted to a number become NA and are not counted.
  my_novice_games_count <- sum(as.numeric(pgn.df[elo_index, ]) < 1200, na.rm = TRUE)
  novice_games_count[i] <- (my_novice_games_count / All_games_count) * 100
}
# Fixed: this line read `Percent_novice_games_count`, which is never assigned;
# the loop above fills `novice_games_count`.
my_Percent_novice_games_count <- data.frame(count = novice_games_count)
Data_novice_games_count <- cbind(
  data.frame(year = year_from_imported_links),
  my_Percent_novice_games_count
)

ggplot(Data_novice_games_count, aes(x = rev(1:length(games_count)), y = count)) +
  # Chart type
  geom_bar(stat = "identity", aes(fill = year)) +
  # Titles
  labs(x = "Years", y = "Number of chess games ( % ) ", title = "Number of novice games (rank < 1200) ") +
  # Legend
  scale_fill_manual(name = "Years", values = unique(Data_novice_games_count$year)) +
  expand_limits(x = c(0, NA), y = c(0, NA)) +
  scale_x_continuous(breaks = c(rev(first_index_of_years)),
                     labels = c(rev(unique(year_from_imported_links)))) +
  scale_y_continuous(labels = unit_format(unit = "%", scale = 1)) +
  theme(text = element_text(size = 20),
        axis.text.x = element_text(angle = 90, hjust = 1)) +
  # Arrow
  geom_segment(aes(58, 10.68, xend = 95, yend = 10),
               linewidth = 1.2,
               arrow = arrow(length = unit(5, "mm"))) +
  # Picture
  inset_element(p = img,
                left = 0.02,
                bottom = 0.54,
                right = 0.42,
                top = 0.9)
# Reduced monthly data sets to analyse.
files_to_read <- paste0(
  Dataset_path, "/", year_from_imported_links, "/",
  month_from_imported_links, ". ", my_month_names,
  "/Data_", full_date_from_imported_links, ".pgn"
)

# Share (in %) of games per month whose "WhiteElo" value is above 1800.
pro_games_count <- numeric(length(files_to_read))
for (i in seq_along(files_to_read)) {
  pgn <- read.table(files_to_read[i], quote = "", sep = "\n", stringsAsFactors = FALSE)
  # Tag values: the text between the quotes of each [Tag "value"] row.
  pgn.df <- data.frame(matrix(sub("\\[\\w+ \\\"(.+)\\\"\\]", "\\1", pgn[, 1]),
                              byrow = TRUE, ncol = 1))
  # Rows whose tag name is "WhiteElo".
  elo_index_v2 <- which(sub("\\[(\\w+).+", "\\1", pgn[, 1]) == "WhiteElo")
  All_games_count_v2 <- length(elo_index_v2)
  # Values that cannot be converted to a number become NA and are not counted.
  my_pro_games_count <- sum(as.numeric(pgn.df[elo_index_v2, ]) > 1800, na.rm = TRUE)
  pro_games_count[i] <- (my_pro_games_count / All_games_count_v2) * 100
}
# Fixed: this line read `Percent_pro_games_count`, which is never assigned;
# the loop above fills `pro_games_count`.
my_Percent_pro_games_count <- data.frame(count = pro_games_count)
Data_pro_games_count <- cbind(
  data.frame(year = year_from_imported_links),
  my_Percent_pro_games_count
)

ggplot(Data_pro_games_count, aes(x = rev(1:length(games_count)), y = count)) +
  # Chart type
  geom_bar(stat = "identity", aes(fill = year)) +
  # Titles
  labs(x = "Years", y = "Number of chess games ( % ) ", title = "Number of pro games (rank > 1800) ") +
  # Legend
  scale_fill_manual(name = "Years", values = unique(Data_pro_games_count$year)) +
  expand_limits(x = c(0, NA), y = c(0, NA)) +
  scale_x_continuous(breaks = c(rev(first_index_of_years)),
                     labels = c(rev(unique(year_from_imported_links)))) +
  scale_y_continuous(labels = unit_format(unit = "%", scale = 1)) +
  theme(text = element_text(size = 20),
        axis.text.x = element_text(angle = 90, hjust = 1)) +
  # Arrow
  geom_segment(aes(65, 38, xend = 86, yend = 37),
               linewidth = 1.2,
               arrow = arrow(length = unit(5, "mm"))) +
  # Picture
  inset_element(p = img3,
                left = 0.39,
                bottom = 0.84,
                right = 0.6,
                top = 0.97)
#files_to_read = paste(Dataset_path,"/",year_from_imported_links,"/",month_from_imported_links,". ",my_month_names,"/Data_",full_date_from_imported_links,".pgn", sep="")
# Share of games, per data file, that contain a given sequence of moves.
#
# Reads the global `files_to_read`. For every file, the rows left non-empty
# by the tag-stripping substitution are treated as move rows; a row is kept
# only while, for each j, the word following "<j>. " equals
# `searching_moves[j]`.
#
# @param searching_moves Character vector of moves; element j is compared
#   with the word that follows move number j.
# @return Numeric vector with one percentage per file: matching rows
#   divided by all move rows, times 100.
search_by_moves <- function(searching_moves) {
  opening_count <- numeric(length(files_to_read))
  for (i in seq_along(files_to_read)) {
    pgn <- read.table(files_to_read[i], quote = "", sep = "\n", stringsAsFactors = FALSE)
    # Rows that are not reduced to an empty string by the substitution.
    moves <- which(sub("\\[(\\b+).+", "\\1", pgn[, 1]) != "")
    All_games_count <- length(moves)
    # Narrow the candidate rows one requested move at a time.
    for (j in seq_along(searching_moves)) {
      my_regex <- paste("(.*?)\\s*", j, ". (\\w+).+", sep = "")
      moves_to_compare <- sub(my_regex, "\\2", pgn[moves, 1])
      moves <- moves[which(moves_to_compare == searching_moves[j])]
    }
    opening_count[i] <- (length(moves) / All_games_count) * 100
  }
  opening_count
}
# Fixed: the result of search_by_moves() was discarded and the next statement
# read `opening_count`, which only exists inside that function.
opening_count <- search_by_moves(c("e3", "d3", "Ne2", "Nd2", "Ng3"))
# Still show the percentages, as the bare call did.
opening_count
my_opening_count <- data.frame(count = opening_count)
Data_opening_count <- cbind(
  data.frame(year = year_from_imported_links),
  my_opening_count
)

ggplot(Data_opening_count, aes(x = rev(1:length(games_count)), y = count)) +
  # Chart type
  geom_bar(stat = "identity", aes(fill = year)) +
  # Titles
  labs(x = "Years", y = "Number of chess games", title = "Number of 'Cow' move sequences played") +
  # Legend
  scale_fill_manual(name = "Years", values = unique(Data_opening_count$year)) +
  expand_limits(x = c(0, NA), y = c(0, 10)) +
  scale_x_continuous(breaks = c(rev(first_index_of_years)),
                     labels = c(rev(unique(year_from_imported_links)))) +
  theme(text = element_text(size = 20),
        axis.text.x = element_text(angle = 90, hjust = 1)) +
  # Arrow
  geom_segment(aes(80, 3.2, xend = 123, yend = 2.2),
               linewidth = 1.2,
               arrow = arrow(length = unit(5, "mm"))) +
  # Picture
  inset_element(p = img4,
                left = 0.42,
                bottom = 0.34,
                right = 0.75,
                top = 0.51)